Rendering HTML¶

In [ ]:
import plotly
plotly.offline.init_notebook_mode()

Loading the Diabetes Dataset¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

data_diabetes = datasets.load_diabetes()
# Converting into dataframe
df_diabetes = pd.DataFrame(data_diabetes.data,columns=data_diabetes.feature_names)
df_diabetes['y'] = data_diabetes.target
df_diabetes.head()
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 y
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0

Exploratory Data Analysis¶

In [ ]:
df_describe = df_diabetes.describe()
df_describe
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 y
count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 442.000000
mean -2.511817e-19 1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17 3.918434e-17 -5.777179e-18 -9.042540e-18 9.293722e-17 1.130318e-17 152.133484
std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 77.093005
min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123988e-01 -1.267807e-01 -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260971e-01 -1.377672e-01 25.000000
25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665608e-02 -3.424784e-02 -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324559e-02 -3.317903e-02 87.000000
50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670422e-03 -4.320866e-03 -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947171e-03 -1.077698e-03 140.500000
75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564379e-02 2.835801e-02 2.984439e-02 2.931150e-02 3.430886e-02 3.243232e-02 2.791705e-02 211.500000
max 1.107267e-01 5.068012e-02 1.705552e-01 1.320436e-01 1.539137e-01 1.987880e-01 1.811791e-01 1.852344e-01 1.335973e-01 1.356118e-01 346.000000
  • Dataset contains 442 data.
  • For all the features mean close to 0 and standard deviation close to 1. But for the target variable mean is 152 & standard deviation is 77 which indicates variability.
In [ ]:
df_diabetes_corr = df_diabetes.corr()
print(df_diabetes_corr)
plt.figure(figsize=(12,10))
sns.heatmap(df_diabetes_corr, annot=True)
plt.title('Heatmap of Correlation Matrix of Diabetes Dataset')
plt.show()
          age       sex       bmi        bp        s1        s2        s3  \
age  1.000000  0.173737  0.185085  0.335428  0.260061  0.219243 -0.075181   
sex  0.173737  1.000000  0.088161  0.241010  0.035277  0.142637 -0.379090   
bmi  0.185085  0.088161  1.000000  0.395411  0.249777  0.261170 -0.366811   
bp   0.335428  0.241010  0.395411  1.000000  0.242464  0.185548 -0.178762   
s1   0.260061  0.035277  0.249777  0.242464  1.000000  0.896663  0.051519   
s2   0.219243  0.142637  0.261170  0.185548  0.896663  1.000000 -0.196455   
s3  -0.075181 -0.379090 -0.366811 -0.178762  0.051519 -0.196455  1.000000   
s4   0.203841  0.332115  0.413807  0.257650  0.542207  0.659817 -0.738493   
s5   0.270774  0.149916  0.446157  0.393480  0.515503  0.318357 -0.398577   
s6   0.301731  0.208133  0.388680  0.390430  0.325717  0.290600 -0.273697   
y    0.187889  0.043062  0.586450  0.441482  0.212022  0.174054 -0.394789   

           s4        s5        s6         y  
age  0.203841  0.270774  0.301731  0.187889  
sex  0.332115  0.149916  0.208133  0.043062  
bmi  0.413807  0.446157  0.388680  0.586450  
bp   0.257650  0.393480  0.390430  0.441482  
s1   0.542207  0.515503  0.325717  0.212022  
s2   0.659817  0.318357  0.290600  0.174054  
s3  -0.738493 -0.398577 -0.273697 -0.394789  
s4   1.000000  0.617859  0.417212  0.430453  
s5   0.617859  1.000000  0.464669  0.565883  
s6   0.417212  0.464669  1.000000  0.382483  
y    0.430453  0.565883  0.382483  1.000000  
  • BMI, S5 has high correlation with the y variable when compared to other features.

Cleaning the Data¶

In [ ]:
# Check for missing values
df_missing = df_diabetes.isnull().sum()
df_missing
Out[ ]:
age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
y      0
dtype: int64

Splitting Data into Training Data & Test Data¶

In [ ]:
from sklearn.model_selection import train_test_split

X = df_diabetes[['bmi','s5']]
y= df_diabetes['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
  • We are only taking BMI & S5 as features beacuse they have high correlation with y variable.
  • Rest of the features are not using.

Cross Validation¶

In [ ]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error,r2_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
# models 

models = {
    # Polynomial regression models
    'poly_2':make_pipeline(PolynomialFeatures(2), LinearRegression()),
    'poly_3':make_pipeline(PolynomialFeatures(3), LinearRegression()),
    # Decision tree models
    'tree_3':tree.DecisionTreeRegressor(max_depth=3),
    'tree_5':tree.DecisionTreeRegressor(max_depth=5),
    # KNN models
    'knn_3':KNeighborsRegressor(n_neighbors=3),
    'knn_5':KNeighborsRegressor(n_neighbors=5),
}

# Using croos validation to evaluate models

# using dictionary to store the scores(multiple metrics)

scoring = {'R2': 'r2',
            'MAE': make_scorer(mean_absolute_error),
            'MAPE': make_scorer(mean_absolute_percentage_error)
            }

# Loop through the models and get the scores
result_scores = {}
for name, model in models.items():
    scores = cross_validate(model, X_train, y_train, cv=5, scoring=scoring)
    result_scores[name] = scores
result_scores
Out[ ]:
{'poly_2': {'fit_time': array([0.00200057, 0.00300074, 0.00299859, 0.00200582, 0.00200057]),
  'score_time': array([0.00099945, 0.00200009, 0.00100136, 0.0009985 , 0.00100017]),
  'test_R2': array([0.43228581, 0.42249479, 0.45512034, 0.45045543, 0.48604324]),
  'test_MAE': array([50.58558101, 45.36980759, 46.13161352, 43.38074686, 47.6912952 ]),
  'test_MAPE': array([0.36562587, 0.33692878, 0.41377779, 0.46593697, 0.48479903])},
 'poly_3': {'fit_time': array([0.00199652, 0.00099921, 0.00100064, 0.00199962, 0.0019989 ]),
  'score_time': array([0.00099802, 0.00100183, 0.00099826, 0.00100255, 0.00100017]),
  'test_R2': array([0.45041869, 0.40973122, 0.46174792, 0.45830304, 0.48175925]),
  'test_MAE': array([49.38501678, 46.05930554, 45.32848821, 43.26921459, 47.49344929]),
  'test_MAPE': array([0.35798695, 0.34411498, 0.40471704, 0.46212987, 0.47836029])},
 'tree_3': {'fit_time': array([0.00200033, 0.00099945, 0.0010004 , 0.00099945, 0.00099945]),
  'score_time': array([0.        , 0.0010016 , 0.        , 0.        , 0.00100017]),
  'test_R2': array([0.43701682, 0.32167862, 0.31959226, 0.37353078, 0.37159155]),
  'test_MAE': array([52.33118645, 47.51945251, 48.4535481 , 46.80048981, 51.84335318]),
  'test_MAPE': array([0.37821915, 0.3387406 , 0.43151498, 0.49029801, 0.5176862 ])},
 'tree_5': {'fit_time': array([0.0010004 , 0.00099802, 0.00099921, 0.00103164, 0.00099468]),
  'score_time': array([0.        , 0.        , 0.00099993, 0.00099254, 0.        ]),
  'test_R2': array([ 0.37452979,  0.14172192, -0.00584287,  0.31735482,  0.33896895]),
  'test_MAE': array([54.42537419, 54.43036146, 60.9255727 , 47.67353586, 51.05476334]),
  'test_MAPE': array([0.39391912, 0.40540775, 0.51201066, 0.46162774, 0.49458176])},
 'knn_3': {'fit_time': array([0.00099969, 0.00099993, 0.        , 0.00101805, 0.00100017]),
  'score_time': array([0.00099969, 0.        , 0.00097847, 0.00100017, 0.00100017]),
  'test_R2': array([0.31750281, 0.23206554, 0.2593057 , 0.47176982, 0.48446127]),
  'test_MAE': array([59.25268817, 50.6827957 , 50.85483871, 42.88172043, 47.81967213]),
  'test_MAPE': array([0.42963723, 0.3830881 , 0.42509481, 0.40206842, 0.42513757])},
 'knn_5': {'fit_time': array([0.00098705, 0.0009923 , 0.0010004 , 0.00099945, 0.        ]),
  'score_time': array([0.00200582, 0.00099683, 0.0010004 , 0.00100017, 0.00100017]),
  'test_R2': array([0.42483855, 0.30885881, 0.34909146, 0.47613233, 0.50973674]),
  'test_MAE': array([54.16129032, 50.4483871 , 47.41612903, 41.58064516, 46.61967213]),
  'test_MAPE': array([0.38140511, 0.38836045, 0.40980985, 0.41520996, 0.43397499])}}

Summmary table for Cross Validation Results¶

In [ ]:
summary_table = {}
for name, score in result_scores.items():
    summary_table[name] = {
        'R2_mean': score['test_R2'].mean(),
        'R2_std': score['test_R2'].std(),
        'MAE_mean': score['test_MAE'].mean(),
        'MAE_std': score['test_MAE'].std(),
        'MAPE_mean': score['test_MAPE'].mean(),
        'MAPE_std': score['test_MAPE'].std()
    }
summary_table = pd.DataFrame(summary_table)
summary_table
Out[ ]:
poly_2 poly_3 tree_3 tree_5 knn_3 knn_5
R2_mean 0.449280 0.452392 0.364682 0.233347 0.353021 0.413732
R2_std 0.021878 0.023701 0.042990 0.144051 0.105879 0.075380
MAE_mean 46.631809 46.307095 49.389606 53.701922 50.298343 48.045225
MAE_std 2.415246 2.055667 2.269404 4.395089 5.325183 4.180914
MAPE_mean 0.413414 0.409462 0.431292 0.453509 0.413005 0.405752
MAPE_std 0.056548 0.053783 0.066825 0.046989 0.017802 0.018962

Identify the best model¶

  • The best model is Polynomial with degreee 3 model. (poly_3).
  • This model has the highest R- Sqaured value and Low values for MAE & MAPE when comapred to other models.

Run Model on test data¶

In [ ]:
models['poly_3'].fit(X_train, y_train)
y_pred = models['poly_3'].predict(X_test)

r2_test = r2_score(y_test, y_pred)
mse_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)

print('Test Data')
print(f'R2 score: {r2_test}')
print(f'MSE: {mse_test}')
print(f'MAPE: {mape_test}')
Test Data
R2 score: 0.43185766610962883
MSE: 46.2130411464925
MAPE: 0.43000619514776434

Plot graph¶

In [ ]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np


x_surf = np.linspace(df_diabetes.bmi.min(), df_diabetes.bmi.max(), 100)
y_surf = np.linspace(df_diabetes.s5.min(), df_diabetes.s5.max(), 100)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)

# Predictions
int_coff = models['poly_3'].steps[-1][1]

y_pred_new = int_coff.intercept_ + int_coff.coef_[0] * x_surf + int_coff.coef_[1] * y_surf

fig = plt.figure(figsize=(20,10))
### Set figure size
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_diabetes['bmi'],df_diabetes['s5'],c='red', marker='o', alpha=0.5)
ax.plot_surface(x_surf,y_surf,y_pred_new, color='b', alpha=0.3)
ax.set_xlabel('BMI')
ax.set_ylabel('S5')
ax.set_zlabel('Y')
plt.show()

Conclusion¶

  • Polynomial model with degree 3 is the best model as it has Highest R-Sqaured value and Less MAE & MAPE values.

Model Limitations:

  • For polynomial models when the degree increases risk of overfitting will increase.
  • Polynomial models are sensitive to outliers as they try to fit all the datapints.

Failure

  • Model may fail when the relationship becomes mode complex.